1c07c3d38bc8aa873075799f9d4ba59075dd8f9a,src/edu/stanford/nlp/parser/dvparser/DVModel.java,DVModel,readWordVectors,#,501
Before Change
// Counters initialised to zero here; the code that updates them is not part of
// this fragment.
int chineseNumberCount = 0;
int chinesePercentCount = 0;
System.err.println("Reading in the word vector file: " + op.lexOptions.wordVectorFile);
// Number of vector components on the line being processed (recomputed per line).
int dimOfWords = 0;
// Ensures the "dimensions do not match" warning is printed at most once.
boolean warned = false;
// Each line is expected to hold a word followed by its vector components.
// NOTE(review): "utf-8" is presumably the file encoding handed to IOUtils.readLines -- confirm.
for (String line : IOUtils.readLines(op.lexOptions.wordVectorFile, "utf-8")) {
// Tokenise on runs of whitespace.
String[] lineSplit = line.split("\\s+");
String word = lineSplit[0];
// Optionally transform the word with the configured word function before it
// is used as the map key.
if (op.wordFunction != null) {
word = op.wordFunction.apply(word);
}
// Every token after the first is treated as a vector component.
dimOfWords = lineSplit.length - 1;
// A non-positive numHid means "not set": adopt the dimensionality of the
// current line. Once set, this branch is skipped for later lines.
if (op.lexOptions.numHid <= 0) {
op.lexOptions.numHid = dimOfWords;
System.err.println("Dimensionality of numHid not set. The length of the word vectors in the given file appears to be " + dimOfWords);
}
// the first entry is the word itself
// the other entries will all be entries in the word vector
// Reconcile the line's dimensionality with numHid: surplus components are
// dropped (warning once), while too few components is a fatal error.
if (dimOfWords > op.lexOptions.numHid) {
if (!warned) {
warned = true;
System.err.println("WARNING: Dimensionality of numHid parameter and word vectors do not match, deleting word vector dimensions to fit!");
}
dimOfWords = op.lexOptions.numHid;
} else if (dimOfWords < op.lexOptions.numHid) {
throw new RuntimeException("Word vectors file has dimension too small for requested numHid of " + op.lexOptions.numHid);
}
// Build a dimOfWords x 1 array (one component per row).
double vec[][] = new double[dimOfWords][1];
// i is 1-based because lineSplit[0] is the word; component i lands in row i-1.
for (int i = 1; i <= dimOfWords; i++) {
vec[i-1][0] = Double.parseDouble(lineSplit[i]);
}
SimpleMatrix vector = new SimpleMatrix(vec);
// Store the vector under the (possibly transformed) word.
wordVectors.put(word, vector);
// TODO: factor out all of these identical blobs
if (op.trainOptions.unknownNumberVector &&
After Change
// Counters initialised to zero here; the code that updates them is not part of
// this fragment.
int chineseNumberCount = 0;
int chinesePercentCount = 0;
// Parsing of the word vector file is delegated to RNNUtils.readRawWordVectors.
// NOTE(review): presumably returns a map from the raw word to its vector, sized
// according to the numHid argument -- confirm against the helper.
// NOTE(review): numHid is passed to the helper before the "<= 0" default below
// is applied, so the helper must cope with a non-positive value -- confirm.
Map<String, SimpleMatrix> rawWordVectors = RNNUtils.readRawWordVectors(op.lexOptions.wordVectorFile, op.lexOptions.numHid);
// NOTE(review): iterating keySet() and calling get() does one lookup per word;
// entrySet() would avoid that if this loop is ever reworked.
for (String word : rawWordVectors.keySet()) {
// Look up the vector with the raw key before the word is transformed below.
SimpleMatrix vector = rawWordVectors.get(word);
// Optionally transform the word with the configured word function; the loop
// variable is reassigned, so the transformed form is what later code sees.
if (op.wordFunction != null) {
word = op.wordFunction.apply(word);
}
// A non-positive numHid means "not set": adopt the element count of the
// current vector. Once set, this branch is skipped for later words.
if (op.lexOptions.numHid <= 0) {
op.lexOptions.numHid = vector.getNumElements();
}
// TODO: factor out all of these identical blobs